#!/usr/bin/env python3
"""
collect_provenance.py — writes:
  - provenance.json
  - results_hashes.json
  - sim_registry_row.csv
Use:
  python collect_provenance.py ^
    --sim_id SIM:V1-PivotFit-002 ^
    --repo_url https://github.com/Kent-Nimmo/vol1-fractal-pivot-calibration ^
    --seed NA ^
    --outputs results\summary.csv ^
    --metrics "..." ^
    --result_summary "..." ^
    --status VERIFIED
"""
import argparse, hashlib, json, os, platform, subprocess, sys, datetime

def sha256_file(path: str) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is opened in binary mode and consumed in 1 MiB pieces, so the
    whole file is never held in memory at once.
    """
    block_size = 1024 * 1024
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # An empty read signals end-of-file and ends the loop.
        while piece := stream.read(block_size):
            digest.update(piece)
    return digest.hexdigest()

def sha256_str(s: str) -> str:
    """Return the hexadecimal SHA-256 digest of *s* encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()

def detect_commit_hash() -> str | None:
    try:
        out = subprocess.check_output(["git", "rev-parse", "HEAD"], stderr=subprocess.DEVNULL)
        return out.decode("utf-8").strip()
    except Exception:
        return None

def detect_run_env() -> str:
    """Describe the runtime environment as a single ``"; "``-joined string.

    The string starts with the platform description and the Python version.
    For each of numpy, scipy and pandas that can be imported, an entry with
    the capitalized module name and its ``__version__`` (``"unknown"`` when
    the attribute is absent) is appended. If the ``CONTAINER_DIGEST``
    environment variable is set and non-empty, a ``container <digest>``
    entry is appended last.
    """
    fields = [platform.platform(), f"Python {platform.python_version()}"]
    for name in ("numpy", "scipy", "pandas"):
        try:
            module = __import__(name)
            version = getattr(module, "__version__", "unknown")
            fields.append(f"{name.capitalize()} {version}")
        except Exception:
            # Optional dependency: skip it when it cannot be imported.
            continue
    digest = os.environ.get("CONTAINER_DIGEST")
    if digest:
        fields.append(f"container {digest}")
    return "; ".join(fields)

def main(argv: list[str] | None = None) -> None:
    """Collect provenance for one simulation run and write the output files.

    Writes the full record as JSON to ``--provenance_out``, the per-output
    SHA-256 hashes as JSON to ``--hashes_out``, and appends one CSV row to
    ``--registry_row_out`` (preceded by a header row when that file is
    missing or empty).

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the behavior when run
            as a script.

    Raises:
        SystemExit: with status 2 when no 40-character commit hash is
            available or ``--config_path`` is not a file; with status 3 when
            a path given to ``--outputs`` is not a file. argparse also exits
            on invalid arguments.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--sim_id", required=True)
    p.add_argument("--repo_url", required=True)
    p.add_argument("--commit_hash", default=None)
    p.add_argument("--seed", default="NA")
    # The configuration is identified either by a file or by a CLI string.
    g = p.add_mutually_exclusive_group()
    g.add_argument("--config_path", default=None)
    g.add_argument("--cli_string", default=None)
    p.add_argument("--outputs", nargs="*", default=[])
    p.add_argument("--metrics", default="NA")
    p.add_argument("--result_summary", default="NA (waiting on results)")
    p.add_argument("--date_ran", default=datetime.date.today().isoformat())
    p.add_argument("--status", default="HOLDING", choices=["HOLDING", "VERIFIED"])
    p.add_argument("--provenance_out", default="provenance.json")
    p.add_argument("--hashes_out", default="results_hashes.json")
    p.add_argument("--registry_row_out", default="sim_registry_row.csv")
    args = p.parse_args(argv)

    # An explicit --commit_hash wins; otherwise fall back to auto-detection.
    commit = args.commit_hash or detect_commit_hash()
    if not commit or len(commit) != 40:
        if args.commit_hash:
            # The hash WAS supplied but has the wrong length; say so rather
            # than claiming it was missing.
            print(f"ERROR: --commit_hash must be a 40-char SHA, got: {args.commit_hash!r}", file=sys.stderr)
        else:
            print("ERROR: commit_hash not supplied and couldn't auto-detect a 40-char SHA.", file=sys.stderr)
        sys.exit(2)

    if args.config_path:
        if not os.path.isfile(args.config_path):
            print(f"ERROR: config_path not found: {args.config_path}", file=sys.stderr)
            sys.exit(2)
        config_ref = args.config_path
        config_hash = sha256_file(args.config_path)
    elif args.cli_string:
        config_ref = args.cli_string
        config_hash = sha256_str(args.cli_string)
    else:
        config_ref, config_hash = "NA", "NA"

    # All outputs are validated and hashed before any file is written, so a
    # missing output aborts without leaving partial provenance behind.
    outputs_hashes = {}
    for path in args.outputs:
        if not os.path.isfile(path):
            print(f"ERROR: output file not found: {path}", file=sys.stderr)
            sys.exit(3)
        outputs_hashes[path] = "sha256:" + sha256_file(path)

    run_env = detect_run_env()

    record = {
        "sim_id": args.sim_id,
        "repo_url": args.repo_url,
        "commit_hash": commit,
        "seed": args.seed,
        "config_ref": config_ref,
        "config_hash": config_hash,
        "run_env": run_env,
        "metrics": args.metrics,
        "result_summary": args.result_summary,
        "date_ran": args.date_ran,
        "status": args.status,
        "outputs_hashes": outputs_hashes or "NA",
    }

    with open(args.provenance_out, "w", encoding="utf-8") as f:
        json.dump(record, f, indent=2)
    with open(args.hashes_out, "w", encoding="utf-8") as f:
        json.dump(outputs_hashes, f, indent=2)

    header = "sim_id,repo_url,commit_hash,seed,config_ref,config_hash,run_env,metrics,result_summary,date_ran,status,outputs_hashes\n"

    def csv_escape(s: str) -> str:
        # Quote a field only when it contains a delimiter, a quote or a line
        # break; a bare "\r" must be quoted too, or it can split the row.
        if any(c in s for c in (",", "\"", "\n", "\r")):
            return "\"" + s.replace("\"", "\"\"") + "\""
        return s

    outputs_str = "NA" if not outputs_hashes else ";".join(f"{k}:{v}" for k, v in outputs_hashes.items())
    row = ",".join(map(csv_escape, [
        record["sim_id"], record["repo_url"], record["commit_hash"], str(record["seed"]),
        record["config_ref"], record["config_hash"], record["run_env"], record["metrics"],
        record["result_summary"], record["date_ran"], record["status"], outputs_str,
    ])) + "\n"

    # Write the header for a new registry and also for an existing but empty
    # file, which would otherwise end up as a header-less CSV.
    registry_path = args.registry_row_out
    write_header = not os.path.exists(registry_path) or os.path.getsize(registry_path) == 0
    with open(registry_path, "a", encoding="utf-8") as f:
        if write_header:
            f.write(header)
        f.write(row)
    print(f"Wrote {args.provenance_out}, {args.hashes_out}, and {args.registry_row_out}.")

# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
